Documentation for the analysis of mouse placental RNA-seq data at e7.5, e8.5 and e9.5.

Load essential files + functions + libraries

tpm2 <- read.table("/work/LAS/geetu-lab/hhvu/project1_2/rna-seq/PlacentaRNA-seq/Files/tpmForClustering.txt", header = T)

summ <- function(trans_cluster, tpm2=tpm2) {
  tpm3 <- tpm2
  tpm3 <- cbind(rownames(tpm2), tpm3)
  rownames(tpm3) <- 1:nrow(tpm3)
  colnames(tpm3)[1] <- "transcripts"
  
  #e7.5
  e7.5_table <- data.frame(tpm3$transcripts, rowMeans(tpm3[,c("E7.5_2", "E7.5_3", "E7.5_4", "E7.5_5", "E7.5_6")]))
  e7.5_table$time <- c("e7.5")
  rownames(e7.5_table) <- 1:nrow(e7.5_table)
  colnames(e7.5_table) <- c("transcripts", "mean_cts_scaled", "time")
  
  #e8.5
  e8.5_table <- data.frame(tpm3$transcripts, rowMeans(tpm3[,c("E8.5_1", "E8.5_2", "E8.5_3", "E8.5_4", "E8.5_5", "E8.5_6")]))
  e8.5_table$time <- c("e8.5")
  rownames(e8.5_table) <- 1:nrow(e8.5_table)
  colnames(e8.5_table) <- c("transcripts", "mean_cts_scaled", "time")
  
  #e9.5
  e9.5_table <- data.frame(tpm3$transcripts, rowMeans(tpm3[,c("E9.5_1", "E9.5_2", "E9.5_3", "E9.5_4", "E9.5_5")]))
  e9.5_table$time <- c("e9.5")
  rownames(e9.5_table) <- 1:nrow(e9.5_table)
  colnames(e9.5_table) <- c("transcripts", "mean_cts_scaled", "time")
  
  summary <- rbind(rbind(e7.5_table, e8.5_table), e9.5_table)
  summary <- summary[order(summary$transcripts),]
  summary <- inner_join(summary, trans_cluster, by = c("transcripts" = "name"))
  summary <- inner_join(summary, t2g, by = c("transcripts" = "target_id"))
  
  return(summary)
}

plotClus <- function(summary, title){
  ascl2 <- subset(summary, summary$ext_gene == "Ascl2") #e7.5
  gjb5 <- subset(summary, summary$ext_gene == "Gjb5") #e7.5
  dnmt1 <- subset(summary, summary$ext_gene == "Dnmt1") #e8.5
  itga4 <- subset(summary, summary$ext_gene == "Itga4") #e8.5
  gjb2 <- subset(summary, summary$ext_gene == "Gjb2") #e9.5
  igf2 <- subset(summary, summary$ext_gene == "Igf2") #e9.5
  p <- ggplot(aes(time, mean_cts_scaled), data = summary) +
    geom_line(aes(group = transcripts), alpha = 0.5, colour = "grey77") +
    geom_line(stat = "summary", fun = "median", size = 2,
              aes(group = 1, color = "Group median")) +
    labs(title = title,
         x = "Time point",
         y = "Scaled mean transcript counts", color = "Legend", linetype = "Legend") +
    theme(plot.title = element_text(size = 15, face = "bold"), legend.text=element_text(size=20)) +
    geom_line(data = ascl2, size = 2.5, aes(group = transcripts, color = "Ascl2", linetype = "Ascl2"), alpha = 1) + #e7.5
    geom_line(data = gjb5, size = 2, aes(group = transcripts, color = "Gjb5", linetype = "Gjb5" ), alpha = 1) + #e7.5
    geom_line(data = dnmt1, size = 2, aes(group = transcripts, color = "Dnmt1", linetype = "Dnmt1"), alpha = 1) + #e8.5
    geom_line(data = itga4, size = 2, aes(group = transcripts, color = "Itga4", linetype = "Itga4"), alpha = 1) + #e8.5
    geom_line(data = gjb2, size = 2, aes(group = transcripts, color = "Gjb2", linetype = "Gjb2"), alpha = 1) + #e9.5
    geom_line(data = igf2, size = 2, aes(group = transcripts, color = "Igf2", linetype = "Igf2"), alpha = 1) + #e9.5
    scale_color_manual(name = "Legend", values = c("Ascl2" = "darkolivegreen4", "Gjb5" = "yellow3",
                                                   "Dnmt1" = "dodgerblue4", "Itga4" = "deepskyblue4",
                                                   "Gjb2" = "saddlebrown", "Igf2" = "salmon3",
                                                   "Group median" = "grey22")) + 
    scale_linetype_manual(name = "Legend", values = c("Ascl2" = "solid", "Gjb5" = "solid",
                                                      "Dnmt1" = "twodash", "Itga4" = "solid",
                                                      "Gjb2" = "solid", "Igf2" = "solid",
                                                      "Group median" = "solid")) + 
    guides(linetype=F,
           colour=guide_legend(keywidth = 3, keyheight = 1)) +
    theme(text = element_text(size=15),
          legend.text=element_text(size=15),
          axis.title.x = element_text(size = 15),
          axis.title.y = element_text(size = 15),
          axis.text.x = element_text(angle=0, hjust=0.5, size = 25),
          axis.text.y = element_text(size = 15)) +
    facet_grid(cols = vars(value))
  return(p)
}

library("ggplot2", suppressMessages())
library("dplyr", suppressMessages())
library("tidyverse", suppressMessages())

t2g <- read.table("/work/LAS/geetu-lab/hhvu/project1_2/rna-seq/PlacentaRNA-seq/Files/t2g.txt", header = T, sep = "\t")
t2g <- t2g[order(t2g$target_id),]
coding <- read.table("/work/LAS/geetu-lab/hhvu/project1_2/rna-seq/PlacentaRNA-seq/Files/Mus_musculus_grcm38_coding_transcripts.txt", header = F)

hc <- hclust(dist(tpm2, method = "euclidean"), "complete")
dend <- as.dendrogram(hc)

library("clusterProfiler")
library("org.Mm.eg.db")

go <- function(subnetworkGenes) {
  GO <- enrichGO(gene = subnetworkGenes, OrgDb=org.Mm.eg.db, ont = "BP", qvalueCutoff = 0.05, maxGSSize=1000, readable = T, keyType = "ENSEMBL")
  goSimplify <- GO
  simplify2 <- as.data.frame(goSimplify)
  simplify2$GeneRatio <- sapply(strsplit(simplify2$GeneRatio, "/"), function(x) as.numeric(x[1])/as.numeric(x[2]))
  simplify2$BgRatio <- sapply(strsplit(simplify2$BgRatio, "/"), function(x) as.numeric(x[1])/as.numeric(x[2]))
  simplify2$Fold <- as.numeric(simplify2$GeneRatio)/as.numeric(simplify2$BgRatio)
  simplify2 <- simplify2[simplify2$qvalue <= 0.05 & simplify2$Fold >= 2 & simplify2$Count >= 5,]
  
  return(simplify2)
}

K = 3

trans_cluster <- cutree(hc, k = 3) %>% enframe()
table(trans_cluster$value)
#> 
#>    1    2    3 
#> 8091 7238 8242
summK3 <- summ(trans_cluster, tpm2)
p <- plotClus(summK3, "Hierarchical Clustering of Transcripts, k = 3")
#> Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
#> "none")` instead.
p


g3 <- unique(summK3[summK3$value == "3", "ens_gene"])
length(g3)
#> [1] 5566
g3go <- go(g3)

K = 4

trans_cluster <- cutree(hc, k = 4) %>% enframe()
table(trans_cluster$value)
#> 
#>    1    2    3    4 
#> 8091 7238 4501 3741
summK4 <- summ(trans_cluster, tpm2)
p <- plotClus(summK4, "Hierarchical Clustering of Transcripts, k = 4")
#> Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
#> "none")` instead.
p


g4 <- unique(summK4[summK4$value == "4", "ens_gene"])
length(g4)
#> [1] 3082
g4go <- go(g4)
#g4go$Description

K = 5

trans_cluster <- cutree(hc, k = 5) %>% enframe()
table(trans_cluster$value)
#> 
#>    1    2    3    4    5 
#> 8091 7238 4501 2532 1209
summK5 <- summ(trans_cluster, tpm2)
p <- plotClus(summK5, "Hierarchical Clustering of Transcripts, k = 5")
#> Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
#> "none")` instead.
p


g4 <- unique(summK5[summK5$value == "4", "ens_gene"])
length(g4)
#> [1] 2184
g4go <- go(g4)
#g4go$Description

g5 <- unique(summK5[summK5$value == "5", "ens_gene"])
length(g5)
#> [1] 1105
g5go <- go(g5)
#g5go$Description

K = 6

trans_cluster <- cutree(hc, k = 6) %>% enframe()
table(trans_cluster$value)
#> 
#>    1    2    3    4    5    6 
#> 4833 3258 7238 4501 2532 1209
summK6 <- summ(trans_cluster, tpm2)
p <- plotClus(summK6, "Hierarchical Clustering of Transcripts, k = 6")
#> Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
#> "none")` instead.
p

K = 7

trans_cluster <- cutree(hc, k = 7) %>% enframe()
table(trans_cluster$value)
#> 
#>    1    2    3    4    5    6    7 
#> 4833 3258 7238 4501 1349 1209 1183
summK7 <- summ(trans_cluster, tpm2)
p <- plotClus(summK7, "Hierarchical Clustering of Transcripts, k = 7")
#> Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
#> "none")` instead.
p

K = 8

trans_cluster <- cutree(hc, k = 8) %>% enframe()
table(trans_cluster$value)
#> 
#>    1    2    3    4    5    6    7    8 
#> 4833 3258 7238 2429 1349 1209 1183 2072
summK8 <- summ(trans_cluster, tpm2)
p <- plotClus(summK8, "Hierarchical Clustering of Transcripts, k = 8")
#> Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
#> "none")` instead.
p

K = 9

trans_cluster <- cutree(hc, k = 9) %>% enframe()
table(trans_cluster$value)
#> 
#>    1    2    3    4    5    6    7    8    9 
#> 2838 3258 7238 2429 1349 1995 1209 1183 2072
summK9 <- summ(trans_cluster, tpm2)
p <- plotClus(summK9, "Hierarchical Clustering of Transcripts, k = 9")
#> Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
#> "none")` instead.
p

K = 10

trans_cluster <- cutree(hc, k = 10) %>% enframe()
table(trans_cluster$value)
#> 
#>    1    2    3    4    5    6    7    8    9   10 
#> 2838 1173 7238 2429 1349 2085 1995 1209 1183 2072
summK10 <- summ(trans_cluster, tpm2)
p <- plotClus(summK10, "Hierarchical Clustering of Transcripts, k = 10")
#> Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
#> "none")` instead.
p

Test the agreement between methods

K = 3 hierarchical clustering

trans_cluster <- cutree(hc, k = 3) %>% enframe()
table(trans_cluster$value)
#> 
#>    1    2    3 
#> 8091 7238 8242
summK3 <- summ(trans_cluster, tpm2)
p <- plotClus(summK3, "Hierarchical Clustering of Transcripts, k = 3")
#> Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
#> "none")` instead.
p


summK3$value <- ifelse(summK3$value == "1", "e8.5",
                        ifelse(summK3$value == "2", "e9.5",
                               ifelse(summK3$value == "3", "e7.5", summK3$value)))
summK3$value <- ifelse(summK3$value == "e8.5", "2",
                        ifelse(summK3$value == "e9.5", "3",
                               ifelse(summK3$value == "e7.5", "1", summK3$value)))

hc1 <- summK3[summK3$value == 1,]
hc2 <- summK3[summK3$value == 2,]
hc3 <- summK3[summK3$value == 3,]

K = 3 K-means

set.seed(123)
km <- kmeans(tpm2, centers = 3)
trans_cluster_kmeans <- km$cluster %>% enframe()
summKmeans <- summ(trans_cluster_kmeans, tpm2)
p <- plotClus(summKmeans, "K-means Clustering of Transcripts")
#> Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
#> "none")` instead.
p


summKmeans$value <- ifelse(summKmeans$value == "1", "e9.5",
                        ifelse(summKmeans$value == "2", "e7.5",
                               ifelse(summKmeans$value == "3", "e8.5", summKmeans$value)))
summKmeans$value <- ifelse(summKmeans$value == "e8.5", "2",
                        ifelse(summKmeans$value == "e9.5", "3",
                               ifelse(summKmeans$value == "e7.5", "1", summKmeans$value)))
summKmeans_hc1 <- summKmeans[summKmeans$transcripts %in% hc1$transcripts,]
summKmeans_hc2 <- summKmeans[summKmeans$transcripts %in% hc2$transcripts,]
summKmeans_hc3 <- summKmeans[summKmeans$transcripts %in% hc3$transcripts,]

library("fossil")
#> Warning: package 'maps' was built under R version 4.0.3
rand.index(as.numeric(summKmeans_hc1$value), as.numeric(hc1$value))
#> [1] 0.616173
rand.index(as.numeric(summKmeans_hc2$value), as.numeric(hc2$value))
#> [1] 0.6144555
rand.index(as.numeric(summKmeans_hc3$value), as.numeric(hc3$value))
#> [1] 0.7729758

SOM

trans_cluster_som <- read.table("Files/trans_cluster_som.txt", header = T)
summSOM <- summ(trans_cluster_som, tpm2)
p <- plotClus(summSOM, "Self-organizing Maps of Transcripts")
#> Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
#> "none")` instead.
p


summSOM$value <- ifelse(summSOM$value == "1", "e8.5",
                        ifelse(summSOM$value == "2", "e9.5",
                               ifelse(summSOM$value == "3", "e7.5", summSOM$value)))
summSOM$value <- ifelse(summSOM$value == "e8.5", "2",
                        ifelse(summSOM$value == "e9.5", "3",
                               ifelse(summSOM$value == "e7.5", "1", summSOM$value)))
summSOM_hc1 <- summSOM[summSOM$transcripts %in% hc1$transcripts,]
summSOM_hc2 <- summSOM[summSOM$transcripts %in% hc2$transcripts,]
summSOM_hc3 <- summSOM[summSOM$transcripts %in% hc3$transcripts,]

rand.index(as.numeric(summSOM_hc1$value), as.numeric(hc1$value))
#> [1] 0.6110996
rand.index(as.numeric(summSOM_hc2$value), as.numeric(hc2$value))
#> [1] 0.6192385
rand.index(as.numeric(summSOM_hc3$value), as.numeric(hc3$value))
#> [1] 0.7752637

Spectral clustering

trans_cluster_sc <- read.table("Files/trans_cluster_sc.txt", header = T)
summSC <- summ(trans_cluster_sc, tpm2)
p <- plotClus(summSC, "Spectral Clustering of Transcripts")
#> Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
#> "none")` instead.
p


summSC$value <- ifelse(summSC$value == "1", "e7.5",
                        ifelse(summSC$value == "2", "e8.5",
                               ifelse(summSC$value == "3", "e9.5", summSC$value)))
summSC$value <- ifelse(summSC$value == "e8.5", "2",
                        ifelse(summSC$value == "e9.5", "3",
                               ifelse(summSC$value == "e7.5", "1", summSC$value)))
summSC_hc1 <- summSC[summSC$transcripts %in% hc1$transcripts,]
summSC_hc2 <- summSC[summSC$transcripts %in% hc2$transcripts,]
summSC_hc3 <- summSC[summSC$transcripts %in% hc3$transcripts,]

rand.index(as.numeric(summSC_hc1$value), as.numeric(hc1$value))
#> [1] 0.6183286
rand.index(as.numeric(summSC_hc2$value), as.numeric(hc2$value))
#> [1] 0.4985626
rand.index(as.numeric(summSC_hc3$value), as.numeric(hc3$value))
#> [1] 0.8725749